# Computations
import numpy as np
import pandas as pd
# scipy
from scipy.stats import norm
# preprocessing
from sklearn import preprocessing
import re
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases dropped the old names, so fall back to the new name when needed.
_grid_style = 'seaborn-whitegrid'
if _grid_style not in plt.style.available:
    _grid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(_grid_style)
del _grid_style
# Global font sizes / text colour for every matplotlib figure in the notebook
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## WordCloud
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")
In this study, we analyze the HR employee attrition dataset available from kaggle.com.
The data is fictional and was created by IBM data scientists.
Categorical Parameters:
| Parameter | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|
| Education | Below College | College | Bachelor | Master | Doctor |
| Environment Satisfaction | Low | Medium | High | Very High | |
| Job Involvement | Low | Medium | High | Very High | |
| Job Satisfaction | Low | Medium | High | Very High | |
| Performance Rating | Low | Good | Excellent | Outstanding | |
| Relationship Satisfaction | Low | Medium | High | Very High | |
| Work Life Balance | Bad | Good | Better | Best | |
This can be encoded as follows,
# Lookup tables turning the integer codes of the ordinal columns into their labels.
# Each entry maps a (renamed) column title to {code: label}, codes starting at 1.
Categorical_Dict = {
    'Education': dict(enumerate(['Below College', 'College', 'Bachelor', 'Master', 'Doctor'], start=1)),
    'Environment Satisfaction': dict(enumerate(['Low', 'Medium', 'High', 'Very High'], start=1)),
    'Job Involvement': dict(enumerate(['Low', 'Medium', 'High', 'Very High'], start=1)),
    'Job Satisfaction': dict(enumerate(['Low', 'Medium', 'High', 'Very High'], start=1)),
    'Performance Rating': dict(enumerate(['Low', 'Good', 'Excellent', 'Outstanding'], start=1)),
    'Relationship Satisfaction': dict(enumerate(['Low', 'Medium', 'High', 'Very High'], start=1)),
    'Work Life Balance': dict(enumerate(['Bad', 'Good', 'Better', 'Best'], start=1)),
}
# Load the attrition workbook (path is relative to the notebook's working directory)
Data = pd.read_excel('Data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx')

# Turn the CamelCase headers into spaced titles ("DailyRate" -> "Daily Rate"), then
# expand the abbreviations and detach the "18" suffix ("Over18" -> "Over 18").
Data.columns = [
    re.sub(r"(\w)([A-Z])", r"\1 \2", x)
      .replace(' Curr ', ' Current ').replace('18',' 18').replace('Num ','Number Of ')
    for x in Data.columns
]
Data['Business Travel'] = Data['Business Travel'].str.replace('_',' ')

# Preview the first rows without the index column.
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in favour of
# Styler.hide(axis='index'); use whichever this pandas version provides.
_preview = Data.head(8).style
_preview = _preview.hide(axis='index') if hasattr(_preview, 'hide') else _preview.hide_index()
display(_preview)
del _preview

# Column every plot below is split by
Target = 'Attrition'
| Age | Attrition | Business Travel | Daily Rate | Department | Distance From Home | Education | Education Field | Employee Count | Employee Number | Environment Satisfaction | Gender | Hourly Rate | Job Involvement | Job Level | Job Role | Job Satisfaction | Marital Status | Monthly Income | Monthly Rate | Number Of Companies Worked | Over 18 | Over Time | Percent Salary Hike | Performance Rating | Relationship Satisfaction | Standard Hours | Stock Option Level | Total Working Years | Training Times Last Year | Work Life Balance | Years At Company | Years In Current Role | Years Since Last Promotion | Years With Current Manager |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41 | Yes | Travel Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 49 | No | Travel Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 37 | Yes | Travel Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 33 | No | Travel Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 27 | No | Travel Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| 32 | No | Travel Frequently | 1005 | Research & Development | 2 | 2 | Life Sciences | 1 | 8 | 4 | Male | 79 | 3 | 1 | Laboratory Technician | 4 | Single | 3068 | 11864 | 0 | Y | No | 13 | 3 | 3 | 80 | 0 | 8 | 2 | 2 | 7 | 7 | 3 | 6 |
| 59 | No | Travel Rarely | 1324 | Research & Development | 3 | 3 | Medical | 1 | 10 | 3 | Female | 81 | 4 | 1 | Laboratory Technician | 1 | Married | 2670 | 9964 | 4 | Y | Yes | 20 | 4 | 1 | 80 | 3 | 12 | 3 | 2 | 1 | 0 | 0 | 0 |
| 30 | No | Travel Rarely | 1358 | Research & Development | 24 | 1 | Life Sciences | 1 | 11 | 4 | Male | 67 | 3 | 1 | Laboratory Technician | 3 | Divorced | 2693 | 13335 | 1 | Y | No | 22 | 4 | 2 | 80 | 1 | 1 | 2 | 3 | 1 | 0 | 0 | 0 |
First off, let's take a look at the dataset.
def Data_Plot(Inp, W = False):
    """Show a bar chart of per-column completeness and return the summary table.

    For every column of `Inp` the summary holds its dtype (as a string), the
    number of NaN values, the row count of `Inp` and the percentage of
    non-NaN values. Bars are coloured by dtype.

    Parameters
    ----------
    Inp : DataFrame to summarise.
    W : figure width passed to the layout; any falsy value keeps the default width.

    Returns
    -------
    DataFrame with columns 'Features', 'Data Type', 'Number of NaN Values',
    'Size' and 'Percentage'.
    """
    n_rows = Inp.shape[0]
    dtypes = Inp.dtypes.astype(str).to_frame(name='Data Type')
    nan_counts = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    summary = dtypes.join(nan_counts, how='outer')
    summary['Size'] = n_rows
    # Share of rows that are present (not NaN), in percent
    summary['Percentage'] = 100 - np.round(100*(summary['Number of NaN Values']/n_rows), 2)
    summary.index.name = 'Features'
    summary = summary.reset_index(drop = False)

    fig = px.bar(summary, x= 'Features', y= 'Percentage', color = 'Data Type', text = 'Data Type',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = summary.columns)

    layout = dict(plot_bgcolor= 'white',
                  legend=dict(x=1.01, y=.5, traceorder="normal",
                              bordercolor="DarkGray", borderwidth=1))
    if W:
        layout['width'] = W
    fig.update_layout(**layout)

    # Text inside each bar, padded with leading spaces, plus a thin black outline
    fig.update_traces(texttemplate= 6*' ' + '%{label}', textposition='inside',
                      marker_line_color= 'Black', marker_line_width=1., opacity=1)
    fig.show()
    return summary
# Draw the completeness chart for the full dataset; the returned summary table is not needed here.
_ = Data_Plot(Data)
A quick overview of data distribution:
# Grid of 30-bin histograms (pandas draws one per numeric column); the returned axes array is discarded.
_ = Data.hist(bins=30, grid=False, figsize=(18,18), color='#34495e', edgecolor='k', zorder=2, rwidth=0.8)
# Colour scheme per grouping variable: bar fill colours and the bar outline colour.
Att_Colors, Att_LC = ['LightSalmon', 'LightBlue'], 'Black'    # Attrition
MF_Colors, MF_LC = ['HotPink', 'RoyalBlue'], 'Navy'    # Gender
Ed_Colors, Ed_LC = ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'], 'Black'    # Education
def DistPlot(Feat, yLim = None, H = 450, titleY = 0.92):
    """Show a histogram of `Data[Feat]` coloured by Attrition, with a marginal box plot.

    Parameters
    ----------
    Feat : column of the global `Data` frame to plot on the x-axis.
    yLim : [low, high] range of the histogram's y-axis; defaults to [0, 80].
    H : figure height in pixels (the width is fixed at 980).
    titleY : vertical position of the title.
    """
    if yLim is None:
        # Built per call so the default list is never shared between calls
        yLim = [0, 80]

    fig = px.histogram(Data, x = Feat, color='Attrition', marginal= 'box',
                       color_discrete_sequence= Att_Colors, hover_data=Data.columns)

    # Frame settings common to both axes; only zero line and grid differ
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zerolinewidth=1, zerolinecolor='Black',
                 gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(zeroline=False, showgrid=False, **frame)
    fig.update_yaxes(zeroline=True, showgrid=True, **frame)

    Name = '%s Distribution by Attrition' % Feat
    fig.update_layout(legend_orientation='v', plot_bgcolor= 'white', height= H, width= 980,
                      # the bold tag is now closed properly (was '<b>' ... '<b>')
                      title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY,
                             'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency')
    fig.update_traces(marker_line_color= Att_LC, marker_line_width=0.5, opacity=1)
    # Only the first y-axis is clamped; the marginal plot keeps its own axis
    fig['layout']['yaxis'].update(range=yLim)
    fig.show()
def PlotX1(df, Feat, ColorFeat = 'Gender', yLim = None, H = 500, titleY = 0.90):
    """Show two side-by-side bar charts of 'Percentage' by `Feat`, one per Attrition value.

    Bars are coloured by `ColorFeat` using the gender palette (`MF_Colors`, outline `MF_LC`).

    Parameters
    ----------
    df : DataFrame with the `Feat`, `ColorFeat`, 'Attrition' and 'Percentage' columns;
         all of its columns are added to the hover text.
    Feat : column plotted on the x-axis.
    ColorFeat : column that defines the bar colours and the legend title.
    yLim : [low, high] range of the shared y-axis; defaults to [0, 35].
    H : figure height in pixels (the width is fixed at 980).
    titleY : vertical position of the title.
    """
    if yLim is None:
        # Built per call so the default list is never shared between calls
        yLim = [0, 35]

    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True,
                        y_title = 'Percent',
                        subplot_titles=('Attrition: No', 'Attrition: Yes'))

    # Left panel: Attrition == 'No', right panel: Attrition == 'Yes'
    for col, answer in enumerate(('No', 'Yes'), start=1):
        panel = px.bar(df.loc[df.Attrition == answer], x= Feat, y= 'Percentage', orientation='v',
                       color = ColorFeat, text = 'Percentage', hover_data= df.columns,
                       color_discrete_sequence = MF_Colors)
        # Only the traces are reused; the layout of the express figure is not carried over
        for trace in panel['data']:
            fig.add_trace(trace, row=1, col=col)
        # The right panel repeats the left panel's categories, so it adds no legend entries
        legend_opts = {} if col == 1 else {'showlegend': False}
        fig.update_traces(marker_line_color= MF_LC, marker_line_width=1, opacity=1,
                          row=1, col=col, **legend_opts)

    # Frame settings common to both axes; only the grid and the y range differ
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                 gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(showgrid=False, **frame)
    fig.update_yaxes(showgrid=True, range= yLim, **frame)

    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat, plot_bgcolor= 'white', height= H, width= 980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    Name = '%s Distribution by %s and Attrition' % (Feat, ColorFeat)
    # the bold tag is now closed properly (was '<b>' ... '<b>')
    fig.update_layout(title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def PlotX2(df, Feat, ColorFeat = 'Education', yLim = None, H = 500, titleY = 0.90):
    """Show two side-by-side bar charts of 'Percentage' by `Feat`, one per Attrition value.

    Bars are coloured by `ColorFeat` using the education palette (`Ed_Colors`, outline `Ed_LC`).

    Parameters
    ----------
    df : DataFrame with the `Feat`, `ColorFeat`, 'Attrition' and 'Percentage' columns;
         all of its columns are added to the hover text.
    Feat : column plotted on the x-axis.
    ColorFeat : column that defines the bar colours and the legend title.
    yLim : [low, high] range of the shared y-axis; defaults to [0, 35].
    H : figure height in pixels (the width is fixed at 980).
    titleY : vertical position of the title.
    """
    if yLim is None:
        # Built per call so the default list is never shared between calls
        yLim = [0, 35]

    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True,
                        y_title = 'Percent',
                        subplot_titles=('Attrition: No', 'Attrition: Yes'))

    # Left panel: Attrition == 'No', right panel: Attrition == 'Yes'
    for col, answer in enumerate(('No', 'Yes'), start=1):
        panel = px.bar(df.loc[df.Attrition == answer], x= Feat, y= 'Percentage', orientation='v',
                       color = ColorFeat, text = 'Percentage', hover_data= df.columns,
                       color_discrete_sequence = Ed_Colors)
        # Only the traces are reused; the layout of the express figure (e.g. its
        # barmode) is not carried over to the subplot figure
        for trace in panel['data']:
            fig.add_trace(trace, row=1, col=col)
        # The right panel repeats the left panel's categories, so it adds no legend entries
        legend_opts = {} if col == 1 else {'showlegend': False}
        # Outline with the education line colour that matches Ed_Colors
        # (the gender outline MF_LC was used here before)
        fig.update_traces(marker_line_color= Ed_LC, marker_line_width=1, opacity=1,
                          row=1, col=col, **legend_opts)

    # Frame settings common to both axes; only the grid and the y range differ
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                 gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(showgrid=False, **frame)
    fig.update_yaxes(showgrid=True, range= yLim, **frame)

    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat, plot_bgcolor= 'white', height= H, width= 980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    Name = '%s Distribution by %s and Attrition' % (Feat, ColorFeat)
    # the bold tag is now closed properly (was '<b>' ... '<b>')
    fig.update_layout(title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def PlotY1(df, Feat, ColorFeat = 'Gender', xLim = None, H = 500, titleY = 0.90):
    """Show two stacked horizontal bar charts of 'Percentage' by `Feat`, one per Attrition value.

    Bars are coloured by `ColorFeat` using the gender palette (`MF_Colors`, outline `MF_LC`).

    Parameters
    ----------
    df : DataFrame with the `Feat`, `ColorFeat`, 'Attrition' and 'Percentage' columns;
         all of its columns are added to the hover text.
    Feat : column plotted on the y-axis.
    ColorFeat : column that defines the bar colours and the legend title.
    xLim : [low, high] range applied to both x-axes; defaults to [0, 35].
    H : figure height in pixels (the width is fixed at 980).
    titleY : vertical position of the title.
    """
    if xLim is None:
        # Built per call so the default list is never shared between calls
        xLim = [0, 35]

    # NOTE(review): with a single column, shared_yaxes appears to have nothing to share;
    # shared_xaxes may have been intended. Left unchanged to keep the current rendering.
    fig = make_subplots(rows=2, cols=1, vertical_spacing = 0.08, shared_yaxes=True,
                        x_title = 'Percent',
                        subplot_titles=('Attrition: No', 'Attrition: Yes'))

    # Top panel: Attrition == 'No', bottom panel: Attrition == 'Yes'
    for row, answer in enumerate(('No', 'Yes'), start=1):
        panel = px.bar(df.loc[df.Attrition == answer], y= Feat, x= 'Percentage', orientation='h',
                       color = ColorFeat, text = 'Percentage', hover_data= df.columns,
                       color_discrete_sequence = MF_Colors)
        # Only the traces are reused; the layout of the express figure is not carried over
        for trace in panel['data']:
            fig.add_trace(trace, row=row, col=1)
        # The bottom panel repeats the top panel's categories, so it adds no legend entries
        legend_opts = {} if row == 1 else {'showlegend': False}
        fig.update_traces(marker_line_color= MF_LC, marker_line_width=1, opacity=1,
                          row=row, col=1, **legend_opts)

    # Frame settings common to both axes; only the grid and the x range differ
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                 gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(showgrid=True, range= xLim, **frame)
    fig.update_yaxes(showgrid=False, **frame)

    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat, plot_bgcolor= 'white', height= H, width= 980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    Name = '%s Distribution by %s and Attrition' % (Feat, ColorFeat)
    # the bold tag is now closed properly (was '<b>' ... '<b>')
    fig.update_layout(title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def PlotY2(df, Feat, ColorFeat = 'Education', xLim = None, H = 500, titleY = 0.90):
    """Show two stacked horizontal bar charts of 'Percentage' by `Feat`, one per Attrition value.

    Bars are coloured by `ColorFeat` using the education palette (`Ed_Colors`, outline `Ed_LC`).

    Parameters
    ----------
    df : DataFrame with the `Feat`, `ColorFeat`, 'Attrition' and 'Percentage' columns;
         all of its columns are added to the hover text.
    Feat : column plotted on the y-axis.
    ColorFeat : column that defines the bar colours and the legend title.
    xLim : [low, high] range applied to both x-axes; defaults to [0, 35].
    H : figure height in pixels (the width is fixed at 980).
    titleY : vertical position of the title.
    """
    if xLim is None:
        # Built per call so the default list is never shared between calls
        xLim = [0, 35]

    # NOTE(review): with a single column, shared_yaxes appears to have nothing to share;
    # shared_xaxes may have been intended. Left unchanged to keep the current rendering.
    fig = make_subplots(rows=2, cols=1, vertical_spacing = 0.08, shared_yaxes=True,
                        x_title = 'Percent',
                        subplot_titles=('Attrition: No', 'Attrition: Yes'))

    # Top panel: Attrition == 'No', bottom panel: Attrition == 'Yes'
    for row, answer in enumerate(('No', 'Yes'), start=1):
        panel = px.bar(df.loc[df.Attrition == answer], y= Feat, x= 'Percentage', orientation='h',
                       color = ColorFeat, text = 'Percentage', hover_data= df.columns,
                       color_discrete_sequence = Ed_Colors)
        # Only the traces are reused; the layout of the express figure (e.g. its
        # barmode) is not carried over to the subplot figure
        for trace in panel['data']:
            fig.add_trace(trace, row=row, col=1)
        # The bottom panel repeats the top panel's categories, so it adds no legend entries
        legend_opts = {} if row == 1 else {'showlegend': False}
        # Outline with the education line colour that matches Ed_Colors
        # (the gender outline MF_LC was used here before)
        fig.update_traces(marker_line_color= Ed_LC, marker_line_width=1, opacity=1,
                          row=row, col=1, **legend_opts)

    # Frame settings common to both axes; only the grid and the x range differ
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                 gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(showgrid=True, range= xLim, **frame)
    fig.update_yaxes(showgrid=False, **frame)

    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat, plot_bgcolor= 'white', height= H, width= 980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    Name = '%s Distribution by %s and Attrition' % (Feat, ColorFeat)
    # the bold tag is now closed properly (was '<b>' ... '<b>')
    fig.update_layout(title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
DistPlot(Feat = 'Age')

# Age brackets shared by both breakdowns below
bins = pd.IntervalIndex.from_tuples([(0, 25), (25, 40), (40, 45),(45, 60)])

# --- Age bracket x Gender ---
# .copy() makes an independent frame, so the binned column is not written into a slice of Data
Temp = Data[['Gender','Age','Attrition']].copy()
Temp['Age'] = pd.cut(Temp['Age'], bins)
Temp = Temp.groupby(['Gender','Age','Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
# Share of each group in the whole dataset, in percent
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop = False).sort_values(by=['Age'])
# Sort while the column is still an interval, then convert to strings for plotting
Temp['Age'] = Temp['Age'].astype(str)
PlotX1(df = Temp, Feat = 'Age')
del Temp

# --- Age bracket x Education ---
Temp = Data[['Education','Age','Attrition']].copy()
Temp['Age'] = pd.cut(Temp['Age'], bins)
Temp = Temp.groupby(['Education','Age','Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop = False)
# Keep the numeric code for sorting; the labels would sort alphabetically
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
# Drop empty combinations, then order by education level and age bracket
# (sort_values returns a new frame, so nothing is modified on a filtered slice)
Temp = Temp[Temp.Percentage != 0].sort_values(by=['Education Code','Age'])
Temp['Age'] = Temp['Age'].astype(str)
PlotX2(df = Temp, Feat ='Age', yLim = [0, 20])
del Temp
# --- Business Travel x Gender ---
Temp = Data.groupby(['Gender','Business Travel','Attrition'])['Attrition'].agg(['count'])
Temp = Temp.rename(columns = {'count':'Count'})
# Share of each group in the whole dataset, in percent
Temp['Percentage'] = (100 * Temp['Count'] / Temp['Count'].sum()).round(2)
Temp = Temp.reset_index(drop = False).sort_values(by=['Business Travel'])
PlotX1(df = Temp, Feat = 'Business Travel', yLim = [0, 40], H = 500, titleY = 0.90)
del Temp

# --- Business Travel x Education ---
Temp = Data.groupby(['Business Travel','Education','Attrition'])['Attrition'].agg(['count'])
Temp = Temp.rename(columns = {'count':'Count'})
Temp['Percentage'] = (100 * Temp['Count'] / Temp['Count'].sum()).round(2)
Temp = Temp.reset_index(drop = False).sort_values(by=['Business Travel'])
# Education codes -> labels for the legend
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
PlotX2(df = Temp, Feat = 'Business Travel', yLim = [0, 25], H = 500, titleY = 0.90)
del Temp
Feat = 'Department'

# --- Department x Gender (horizontal bars) ---
Temp = Data.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg(['count'])
Temp = Temp.rename(columns = {'count':'Count'})
# Share of each group in the whole dataset, in percent
Temp['Percentage'] = (100 * Temp['Count'] / Temp['Count'].sum()).round(2)
Temp = Temp.reset_index(drop = False).sort_values(by=[Feat])
PlotY1(df = Temp, Feat = Feat, xLim = [0, 40], H = 550, titleY = 0.90)
del Temp

# --- Department x Education ---
Temp = Data.groupby([Feat,'Education','Attrition'])['Attrition'].agg(['count'])
Temp = Temp.rename(columns = {'count':'Count'})
Temp['Percentage'] = (100 * Temp['Count'] / Temp['Count'].sum()).round(2)
Temp = Temp.reset_index(drop = False).sort_values(by=[Feat])
# Education codes -> labels for the legend
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
PlotX2(df = Temp, Feat = Feat, yLim = [0, 25])
del Temp, Feat
Feat = 'Distance From Home'
DistPlot(Feat, yLim = [0, 250], H = 450, titleY = 0.92)

# Distance brackets shared by both breakdowns below
bins = pd.IntervalIndex.from_tuples([(0, 5), (5, 10), (10, 20),(20, 30)])

# --- Distance bracket x Gender ---
# .copy() makes an independent frame, so the binned column is not written into a slice of Data
Temp = Data[['Gender',Feat,'Attrition']].copy()
Temp[Feat] = pd.cut(Temp[Feat], bins)
Temp = Temp.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
# Share of each group in the whole dataset, in percent
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop = False).sort_values(by=[Feat])
# Sort while the column is still an interval, then convert to strings for plotting
Temp[Feat] = Temp[Feat].astype(str)
PlotX1(df = Temp, Feat = Feat, yLim = [0, 25])
del Temp

# --- Distance bracket x Education ---
Temp = Data[['Education',Feat,'Attrition']].copy()
Temp[Feat] = pd.cut(Temp[Feat], bins)
Temp = Temp.groupby(['Education',Feat,'Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop = False).sort_values(by=[Feat])
# Education codes -> labels for the legend
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
Temp[Feat] = Temp[Feat].astype(str)
PlotX2(df = Temp, Feat = Feat, yLim = [0, 14])
del Temp, Feat
# --- Education x Gender ---
# Group on the numeric education codes; the labels are substituted only after sorting
Temp = Data.groupby(['Gender','Education','Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
# Share of each group in the whole dataset, in percent
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
# Sorting the codes keeps the x-axis in level order (Below College ... Doctor);
# sorting the labels, as before, ordered the levels alphabetically
Temp = Temp.reset_index(drop = False).sort_values(by=['Education'])
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
PlotX1(df = Temp, Feat = 'Education', yLim = [0, 25])
del Temp
Feat = 'Education Field'

# --- Education Field x Gender ---
Temp = Data.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
# Share of each group in the whole dataset, in percent
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop = False).sort_values(by=[Feat])
PlotX1(df = Temp, Feat = Feat, yLim = [0, 25])
del Temp

# --- Education Field x Education ---
Temp = Data.groupby(['Education',Feat,'Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop = False)
# Keep the numeric code for sorting; the labels would sort alphabetically
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
# sort_values returns a new frame, so nothing is modified on the filtered slice
Temp = Temp[Temp.Percentage != 0].sort_values(by=['Education Code',Feat])
# Break multi-word field names onto separate lines; Plotly text needs '<br>' for a
# line break (the '\n' used before is not a line break in Plotly text)
Temp[Feat] = Temp[Feat].map(lambda x: x.replace('&','and').replace(' ','<br>'))
PlotX2(df = Temp, Feat = Feat, yLim = [0, 14])
del Temp, Feat
Feat = 'Environment Satisfaction'

# --- Environment Satisfaction x Gender ---
# Group on the numeric codes; the labels are substituted only after sorting
Temp = Data.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
# Share of each group in the whole dataset, in percent
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
# Sorting the codes keeps the x-axis in level order (Low ... Very High);
# sorting the labels, as before, ordered the levels alphabetically
Temp = Temp.reset_index(drop = False).sort_values(by=[Feat])
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX1(df = Temp, Feat = Feat, yLim = [0, 20])

# --- Environment Satisfaction x Education ---
Temp = Data.groupby(['Education',Feat,'Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop = False)
# Keep the numeric code for sorting; the labels would sort alphabetically
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
# sort_values returns a new frame, so nothing is modified on the filtered slice
Temp = Temp[Temp.Percentage != 0].sort_values(by=['Education Code',Feat])
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df = Temp, Feat = Feat, yLim = [0, 12])
Feat = 'Hourly Rate'
DistPlot(Feat, yLim = [0, 140], H = 500, titleY = 0.92)

# Hourly-rate brackets shared by both breakdowns below
bins = pd.IntervalIndex.from_tuples([(25, 50), (50, 75), (75, 100)])

# --- Hourly Rate bracket x Gender ---
# .copy() makes an independent frame, so the binned column is not written into a slice of Data
Temp = Data[['Gender',Feat,'Attrition']].copy()
Temp[Feat] = pd.cut(Temp[Feat], bins)
Temp = Temp.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
# Share of each group in the whole dataset, in percent
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop = False).sort_values(by=[Feat])
# Sort while the column is still an interval, then convert to strings for plotting
Temp[Feat] = Temp[Feat].astype(str)
PlotX1(df = Temp, Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)

# --- Hourly Rate bracket x Education ---
Temp = Data[['Education',Feat,'Attrition']].copy()
Temp[Feat] = pd.cut(Temp[Feat], bins)
Temp = Temp.groupby(['Education',Feat,'Attrition'])['Attrition'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop = False)
# Keep the numeric code for sorting; the labels would sort alphabetically
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
# Drop empty combinations, then order by education level and rate bracket
# (sort_values returns a new frame, so nothing is modified on a filtered slice)
Temp = Temp[Temp.Percentage != 0].sort_values(by=['Education Code',Feat])
Temp[Feat] = Temp[Feat].astype(str)
PlotX2(df = Temp, Feat = Feat, yLim = [0, 14], H = 500, titleY = 0.90)
del Temp, Feat
Feat = 'Job Involvement'
Temp = Data[['Gender',Feat,'Attrition']]
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
Temp = Temp.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp = Temp.sort_values(by=[Feat])
PlotX1(df = Temp, Feat = Feat, yLim = [0, 35], H = 500, titleY = 0.90)
Temp = Data[['Education',Feat,'Attrition']]
Temp = Temp.groupby(['Education',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
Temp = Temp[Temp.Percentage != 0]
Temp.sort_values(by=['Education Code',Feat], inplace = True)
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df = Temp, Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
del Temp, Feat
Feat = 'Job Level'
Temp = Data[['Gender',Feat,'Attrition']]
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
Temp = Temp.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp = Temp.sort_values(by=[Feat])
PlotX1(df = Temp, Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
Temp = Data[['Education',Feat,'Attrition']]
Temp = Temp.groupby(['Education',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
Temp = Temp[Temp.Percentage != 0]
Temp.sort_values(by=['Education Code',Feat], inplace = True)
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df = Temp, Feat = Feat, yLim = [0, 12], H = 500, titleY = 0.90)
del Temp, Feat
Feat = 'Job Role'
Temp = Data[['Gender',Feat,'Attrition']]
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
Temp = Temp.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp = Temp.sort_values(by=[Feat])
PlotX1(df = Temp, Feat = Feat, yLim = [0, 12], H = 500, titleY = 0.90)
Temp = Data[['Education',Feat,'Attrition']]
Temp = Temp.groupby(['Education',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
Temp = Temp[Temp.Percentage != 0]
Temp.sort_values(by=['Education Code',Feat], inplace = True)
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df = Temp, Feat = Feat, yLim = [0, 8], H = 500, titleY = 0.90)
del Temp, Feat
Feat = 'Job Satisfaction'
Temp = Data[['Gender',Feat,'Attrition']]
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
Temp = Temp.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp = Temp.sort_values(by=[Feat])
PlotX1(df = Temp, Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
Temp = Data[['Education',Feat,'Attrition']]
Temp = Temp.groupby(['Education',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
Temp = Temp[Temp.Percentage != 0]
Temp.sort_values(by=['Education Code',Feat], inplace = True)
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df = Temp, Feat = Feat, yLim = [0, 10], H = 500, titleY = 0.90)
del Temp, Feat
Feat = 'Marital Status'
Temp = Data[['Gender',Feat,'Attrition']]
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
Temp = Temp.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp = Temp.sort_values(by=[Feat])
PlotX1(df = Temp, Feat = Feat, yLim = [0, 25], H = 500, titleY = 0.90)
Temp = Data[['Education',Feat,'Attrition']]
Temp = Temp.groupby(['Education',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
Temp = Temp[Temp.Percentage != 0]
Temp.sort_values(by=['Education Code',Feat], inplace = True)
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df = Temp, Feat = Feat, yLim = [0, 16], H = 500, titleY = 0.90)
del Temp, Feat
Feat = 'Number Of Companies Worked'
Temp = Data[['Gender',Feat,'Attrition']]
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
Temp = Temp.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp = Temp.sort_values(by=[Feat])
PlotX1(df = Temp, Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
Temp = Data[['Education',Feat,'Attrition']]
Temp = Temp.groupby(['Education',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
Temp = Temp[Temp.Percentage != 0]
Temp.sort_values(by=['Education Code',Feat], inplace = True)
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df = Temp, Feat = Feat, yLim = [0, 12], H = 500, titleY = 0.90)
del Temp, Feat
Feat = 'Over Time'
Temp = Data[['Gender',Feat,'Attrition']]
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
Temp = Temp.groupby(['Gender',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp = Temp.sort_values(by=[Feat])
PlotX1(df = Temp, Feat = Feat, yLim = [0, 40], H = 500, titleY = 0.90)
Temp = Data[['Education',Feat,'Attrition']]
Temp = Temp.groupby(['Education',Feat,'Attrition'])['Attrition'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
Temp = Temp[Temp.Percentage != 0]
Temp.sort_values(by=['Education Code',Feat], inplace = True)
if Feat in Categorical_Dict.keys():
Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df = Temp, Feat = Feat, yLim = [0, 30], H = 500, titleY = 0.90)
del Temp, Feat
# Attrition breakdown for 'Percent Salary Hike': first split by gender
# (PlotX1), then by education level (PlotX2). 'Percentage' is each group's
# share of all counted rows, rounded to two decimals.
Feat = 'Percent Salary Hike'

# --- Gender x Feat x Attrition ---
Temp = Data[['Gender', Feat, 'Attrition']]
Temp = Temp.groupby(['Gender', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Sort on the raw values before swapping in display labels, so a coded feature
# would keep its numeric order (the same approach the education table uses
# below) instead of being ordered alphabetically by label.
Temp = Temp.sort_values(by=[Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX1(df=Temp, Feat=Feat, yLim=[0, 10], H=500, titleY=0.90)

# --- Education x Feat x Attrition ---
Temp = Data[['Education', Feat, 'Attrition']]
Temp = Temp.groupby(['Education', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Keep the numeric education code for sorting; the visible column gets the label.
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
# Filtering and sorting in one chained expression yields a fresh frame, so the
# label assignment below never writes into a filtered slice.
Temp = Temp[Temp['Percentage'] != 0].sort_values(by=['Education Code', Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df=Temp, Feat=Feat, yLim=[0, 6], H=500, titleY=0.90)
del Temp, Feat
# Attrition breakdown for 'Performance Rating': first split by gender
# (PlotX1), then by education level (PlotX2). 'Percentage' is each group's
# share of all counted rows, rounded to two decimals.
Feat = 'Performance Rating'

# --- Gender x Feat x Attrition ---
Temp = Data[['Gender', Feat, 'Attrition']]
Temp = Temp.groupby(['Gender', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Sort on the numeric rating codes before swapping in display labels, so the
# ratings keep their ordinal order (the same approach the education table uses
# below) instead of being ordered alphabetically by label.
Temp = Temp.sort_values(by=[Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX1(df=Temp, Feat=Feat, yLim=[0, 50], H=500, titleY=0.90)

# --- Education x Feat x Attrition ---
Temp = Data[['Education', Feat, 'Attrition']]
Temp = Temp.groupby(['Education', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Keep the numeric education code for sorting; the visible column gets the label.
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
# Filtering and sorting in one chained expression yields a fresh frame, so the
# label assignment below never writes into a filtered slice.
Temp = Temp[Temp['Percentage'] != 0].sort_values(by=['Education Code', Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df=Temp, Feat=Feat, yLim=[0, 30], H=500, titleY=0.90)
del Temp, Feat
# Attrition breakdown for 'Relationship Satisfaction': first split by gender
# (PlotX1), then by education level (PlotX2). 'Percentage' is each group's
# share of all counted rows, rounded to two decimals.
Feat = 'Relationship Satisfaction'

# --- Gender x Feat x Attrition ---
Temp = Data[['Gender', Feat, 'Attrition']]
Temp = Temp.groupby(['Gender', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Sort on the raw codes before swapping in display labels, so the levels keep
# their numeric order (the same approach the education table uses below)
# instead of being ordered alphabetically by label.
Temp = Temp.sort_values(by=[Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX1(df=Temp, Feat=Feat, yLim=[0, 20], H=500, titleY=0.90)

# --- Education x Feat x Attrition ---
Temp = Data[['Education', Feat, 'Attrition']]
Temp = Temp.groupby(['Education', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Keep the numeric education code for sorting; the visible column gets the label.
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
# Filtering and sorting in one chained expression yields a fresh frame, so the
# label assignment below never writes into a filtered slice.
Temp = Temp[Temp['Percentage'] != 0].sort_values(by=['Education Code', Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df=Temp, Feat=Feat, yLim=[0, 12], H=500, titleY=0.90)
del Temp, Feat
# Attrition breakdown for 'Stock Option Level': first split by gender
# (PlotX1), then by education level (PlotX2). 'Percentage' is each group's
# share of all counted rows, rounded to two decimals.
Feat = 'Stock Option Level'

# --- Gender x Feat x Attrition ---
Temp = Data[['Gender', Feat, 'Attrition']]
Temp = Temp.groupby(['Gender', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Sort on the raw values before swapping in display labels, so a coded feature
# would keep its numeric order (the same approach the education table uses
# below) instead of being ordered alphabetically by label.
Temp = Temp.sort_values(by=[Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX1(df=Temp, Feat=Feat, yLim=[0, 25], H=500, titleY=0.90)

# --- Education x Feat x Attrition ---
Temp = Data[['Education', Feat, 'Attrition']]
Temp = Temp.groupby(['Education', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Keep the numeric education code for sorting; the visible column gets the label.
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
# Filtering and sorting in one chained expression yields a fresh frame, so the
# label assignment below never writes into a filtered slice.
Temp = Temp[Temp['Percentage'] != 0].sort_values(by=['Education Code', Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df=Temp, Feat=Feat, yLim=[0, 16], H=500, titleY=0.90)
del Temp, Feat
# 'Total Working Years': overall distribution, then attrition counts per
# ten-year band split by gender (PlotX1) and by education level (PlotX2).
# 'Percentage' is each group's share of all counted rows, rounded to 2 decimals.
Feat = 'Total Working Years'
DistPlot(Feat=Feat, yLim=[0, 250], H=450, titleY=0.92)

# Ten-year bands shared by both tables. The first left edge sits just below
# zero so that a value of exactly 0 lands in the first band.
# NOTE(review): values above 40 fall outside every band — confirm the data
# never exceeds that.
bins = pd.IntervalIndex.from_tuples([(-1e-10, 10), (10, 20), (20, 30), (30, 40)])

# --- Gender x band x Attrition ---
# .copy() makes Temp independent of Data, so the binned column is written to
# our own frame rather than to a column slice of Data.
Temp = Data[['Gender', Feat, 'Attrition']].copy()
Temp[Feat] = pd.cut(Temp[Feat], bins)
Temp = Temp.groupby(['Gender', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Sort while the column is still interval-typed, then stringify for plotting.
Temp = Temp.sort_values(by=[Feat])
Temp[Feat] = Temp[Feat].astype(str)
PlotX1(df=Temp, Feat=Feat, yLim=[0, 35], H=500, titleY=0.90)

# --- Education x band x Attrition ---
Temp = Data[['Education', Feat, 'Attrition']].copy()
Temp[Feat] = pd.cut(Temp[Feat], bins)
Temp = Temp.groupby(['Education', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
Temp = Temp.sort_values(by=[Feat])
Temp[Feat] = Temp[Feat].astype(str)
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
PlotX2(df=Temp, Feat=Feat, yLim=[0, 20], H=500, titleY=0.90)

# Distribution of 'Training Times Last Year'.
DistPlot(Feat='Training Times Last Year', yLim=[0, 600], H=450, titleY=0.92)
# Attrition breakdown for 'Work Life Balance': first split by gender
# (PlotX1), then by education level (PlotX2). 'Percentage' is each group's
# share of all counted rows, rounded to two decimals.
Feat = 'Work Life Balance'

# --- Gender x Feat x Attrition ---
Temp = Data[['Gender', Feat, 'Attrition']]
Temp = Temp.groupby(['Gender', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Sort on the raw codes before swapping in display labels, so the levels keep
# their numeric order (the same approach the education table uses below)
# instead of being ordered alphabetically by label.
Temp = Temp.sort_values(by=[Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX1(df=Temp, Feat=Feat, yLim=[0, 35], H=500, titleY=0.90)

# --- Education x Feat x Attrition ---
Temp = Data[['Education', Feat, 'Attrition']]
Temp = Temp.groupby(['Education', Feat, 'Attrition'])['Attrition'].count().to_frame('Count')
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp = Temp.reset_index(drop=False)
# Keep the numeric education code for sorting; the visible column gets the label.
Temp['Education Code'] = Temp['Education']
Temp['Education'] = Temp['Education'].replace(Categorical_Dict['Education'])
# Filtering and sorting in one chained expression yields a fresh frame, so the
# label assignment below never writes into a filtered slice.
Temp = Temp[Temp['Percentage'] != 0].sort_values(by=['Education Code', Feat])
if Feat in Categorical_Dict:
    Temp[Feat] = Temp[Feat].replace(Categorical_Dict[Feat])
PlotX2(df=Temp, Feat=Feat, yLim=[0, 20], H=500, titleY=0.90)
del Temp, Feat
# Distribution plots for the four tenure-related features. Height and title
# position are identical across them; only the feature and the upper y-limit vary.
_tenure_plot_opts = {'H': 450, 'titleY': 0.92}
DistPlot(Feat='Years At Company', yLim=[0, 200], **_tenure_plot_opts)
DistPlot(Feat='Years In Current Role', yLim=[0, 400], **_tenure_plot_opts)
DistPlot(Feat='Years Since Last Promotion', yLim=[0, 600], **_tenure_plot_opts)
DistPlot(Feat='Years With Current Manager', yLim=[0, 400], **_tenure_plot_opts)